import os
import pandas as pd
import csv
import numpy as np
import math
from matplotlib.lines import Line2D
import matplotlib as mpl
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
import random
from scipy.spatial import distance
from scipy import stats
from sklearn import manifold
from sklearn.decomposition import PCA
import statsmodels.stats.multitest as sm
This file includes the results of a microbiome analysis performed on samples taken from four individuals that were originally used to determine the “Impact of DNA source on genetic variant detection from human whole-genome sequencing data”.
This included blood, saliva and buccal samples taken from four individuals (blood samples were taken at a different time than saliva and buccal samples). Additionally, a methylation-based enrichment for eukaryotic DNA was performed on the saliva and buccal samples.
Fastq.gz files were downloaded from the ENA database, project accession number PRJNA523344
Kneaddata was used for quality control and removal of human sequences. This included:
- Trimmomatic 0.39: “SLIDINGWINDOW:4:20 MINLEN:50”
- Bowtie2 with the GRCh38_PhiX database (to remove human and PhiX reads): “–fast –dovetail”
#3
#set up colors function (to get up to 120 colors, but with up to 40 unique colors)
def get_cols(num):
colormap_20, colormap_40b, colormap_40c = mpl.cm.get_cmap('tab20', 256), mpl.cm.get_cmap('tab20b', 256), mpl.cm.get_cmap('tab20c', 256)
norm, norm2 = mpl.colors.Normalize(vmin=0, vmax=19), mpl.colors.Normalize(vmin=20, vmax=39)
m1, m2, m3 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap_20), mpl.cm.ScalarMappable(norm=norm, cmap=colormap_40b), mpl.cm.ScalarMappable(norm=norm2, cmap=colormap_40c)
colors_20 = [m1.to_rgba(a) for a in range(20)]
colors_40 = [m2.to_rgba(a) for a in range(20)]+[m3.to_rgba(a) for a in range(20,40)]
if num < 21: return colors_20
elif num < 41: return colors_40
else: return colors_40+colors_40+colors_40
#and colors and shapes for different participants and body sites
colors_dict, shapes_dict = {'Blood':'#900C3F', 'Saliva':'#016F85', 'Buccal':'#ff8300', 'Saliva_euk':'#02aed1', 'Buccal_euk':'#FFC300'}, {'Huref':'o', 'PGPC-0002':'^', 'PGPC-0005':'*', 'PGPC-0006':'s', 'PGPC-0050':'p'}
#4
#get numbers of reads for different steps
reads = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/read_counts.txt', sep='\t', index_col=3, header=0)
participant_dict, site_dict, full_name_dict = {}, {}, {}
samples = list(reads.index.values)
for s in samples:
participant_dict[s] = reads.loc[s, 'Participant']
site_dict[s] = reads.loc[s, 'Body site']
full_name_dict[s] = reads.loc[s, 'Participant']+' '+reads.loc[s, 'Body site']
total_reads = pd.DataFrame(reads.loc[:, 'cat_reads'])
sample_names = [participant_dict[name]+' '+site_dict[name] for name in samples]
colors = [colors_dict[s] for s in list(reads.loc[:, 'Body site'].values)]
shapes = [shapes_dict[s] for s in list(reads.loc[:, 'Participant'].values)]
#5
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Reads kept (%)')
plt.xlim([-0.5,20.5])
plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads kept (%)')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()
#6
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.xlim([-0.5,20.5])
plt.ylabel('Reads remaining')
plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads remaining')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()
#7
reads_remain = reads.loc[:, ['Percentage', 'cat_reads']].rename(index=full_name_dict, columns = {'cat_reads':'Number'})
#8
py$reads_remain %>%
kable() %>%
kable_styling()
| Percentage | Number | |
|---|---|---|
| Huref Blood | 0.2319067 | 1087311 |
| PGPC-0002 Blood | 0.5748318 | 3321664 |
| PGPC-0005 Blood | 0.5149286 | 1903482 |
| PGPC-0006 Blood | 0.7146067 | 3593441 |
| PGPC-0050 Blood | 0.6870164 | 2940390 |
| PGPC-0002 Saliva | 3.9289510 | 16561343 |
| PGPC-0005 Saliva | 55.6458826 | 244058333 |
| PGPC-0006 Saliva | 13.1608121 | 60497393 |
| PGPC-0050 Saliva | 56.3700720 | 257049004 |
| PGPC-0002 Buccal | 2.8112226 | 11462781 |
| PGPC-0005 Buccal | 2.8202504 | 12751614 |
| PGPC-0006 Buccal | 5.1521560 | 22434503 |
| PGPC-0050 Buccal | 2.4297107 | 10667011 |
| PGPC-0002 Saliva_euk | 1.2333408 | 5345888 |
| PGPC-0005 Saliva_euk | 8.3104464 | 38138382 |
| PGPC-0006 Saliva_euk | 1.9067564 | 8786635 |
| PGPC-0050 Saliva_euk | 5.6187272 | 25037552 |
| PGPC-0002 Buccal_euk | 0.9385856 | 4380036 |
| PGPC-0005 Buccal_euk | 1.2119820 | 5584415 |
| PGPC-0006 Buccal_euk | 1.5567291 | 7232280 |
| PGPC-0050 Buccal_euk | 1.3836141 | 6382288 |
The taxonomy has been profiled using:
1. HUMAnN2
- MetaPhlAn2
2. Kraken2 with Bracken
- GTDB (no confidence parameter set) -
using the database constructed using Struo, release 89 - GTDB (confidence = 0.1)
- Minikraken v1 (no human genome, no confidence parameter set)
- Minikraken v1 (no human genome, confidence = 0.1)
- Minikraken v2 (with human genome, no confidence parameter set)
- Minikraken v2 (with human genome, confidence = 0.1)
- RefSeq Complete v93 (no confidence parameter set)
- RefSeq Complete v93 (confidence = 0.1)
#9
#get the taxonomy file and sort it to strain and genus level
taxa = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/metaphlan_merged.tsv', sep='\t', header=0, index_col=0)
tax_names = list(taxa.index.values)
keeping = []
for a in range(len(tax_names)):
if 't__' in tax_names[a]:
keeping.append(True)
elif 'unclassified' in tax_names[a]:
keeping.append(True)
else:
keeping.append(False)
strain = taxa.loc[keeping, :]
strain_names = list(strain.index.values)
strain_dict = {}
for i in range(len(strain_names)):
strain_dict[strain_names[i]] = strain_names[i].split('|s__')[0].split('|g__')[1]
genus = strain.rename(index=strain_dict)
genus = genus.groupby(by=genus.index, axis=0).sum()
#10
#define the function that calculates the nmds plots
def transform_for_NMDS(df, dist_met='braycurtis'):
X = df.iloc[0:].values
y = df.iloc[:,0].values
seed = np.random.RandomState(seed=3)
X_true = X
similarities = distance.cdist(X_true, X_true, dist_met)
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
dissimilarity="precomputed", n_jobs=1)
#print(similarities)
pos = mds.fit(similarities).embedding_
nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
dissimilarity="precomputed", random_state=seed, n_jobs=1,
n_init=1)
npos = nmds.fit_transform(similarities, init=pos)
# Rescale the data
pos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((pos ** 2).sum())
npos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((npos ** 2).sum())
# Rotate the data
clf = PCA()
X_true = clf.fit_transform(X_true)
pos = clf.fit_transform(pos)
npos = clf.fit_transform(npos)
return pos, npos, nmds.stress_
strain_t = strain.transpose()
genus_t = genus.transpose()
handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]
#11
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'braycurtis')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'braycurtis')
plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()
#12
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'euclidean')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'euclidean')
plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()
#13
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'jaccard')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'jaccard')
plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()
Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Kingdom level for each sample.
#14
plt.figure(figsize=(7,5))
ax1 = plt.subplot(111)
plt.bar(list(taxa.columns.values), taxa.loc['k__Viruses', :].values, color='#C70039', edgecolor='k')
plt.bar(list(taxa.columns.values), taxa.loc['k__Bacteria', :].values, bottom=taxa.loc['k__Viruses', :].values, color='#026B81', edgecolor='k')
#plt.xticks(list(taxa.columns.values), sample_names, rotation=90)
empty = []
for x in range(0,21):
empty.append('')
ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
handles = [Patch(facecolor='#C70039', edgecolor='k', label='Viruses'), Patch(facecolor='#026B81', edgecolor='k', label='Bacteria')]
plt.legend(handles=handles, bbox_to_anchor=(1,1.05))
plt.tight_layout()
plt.show()
Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Genus level for each sample. Genera with below 1% maximum relative abundance have been removed.
#15
genus = genus[genus.max(axis=1) > 1]
genera = list(genus.index.values)
plt.figure(figsize=(10,5))
ax1 = plt.subplot(111)
gen_colors = get_cols(len(genus.index.values))
handles = []
for g in range(len(genera)):
this_gen = genus.loc[genera[g], :].values
if g == 0:
ax1.bar(list(genus.columns.values), this_gen, color=gen_colors[g], edgecolor='k')
total = this_gen
else:
ax1.bar(list(genus.columns.values), this_gen, bottom=total, color=gen_colors[g], edgecolor='k')
total = this_gen+total
handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=genera[g]))
empty = []
for x in range(0,21):
empty.append('')
ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
plt.legend(handles=handles, bbox_to_anchor=(1,1.05), ncol=2)
plt.tight_layout()
plt.show()
#16
#get all samples into dataframes based on the database that they use
folders = sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2'))
del folders[0]
kraken_columns = {0:'Percent fragments clade', 1:'Number fragments clade', 2:'Number fragments taxon', 3:'Level', 4:'NCBI ID', 5:'Taxon name'}
kraken_all_db, bracken_all_db, all_domains = [], [], {}
for fol in folders:
if fol == 'db_genera':
continue
if not os.path.isdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol):
continue
bracken, kraken, bracken_kreport = [], [], []
bracken_pd, kraken_pd = [], []
for fi in sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol)):
if fi[-7:] == 'bracken':
bracken.append(fi)
elif fi[-7:] == 'kreport' and 'bracken' not in fi:
kraken.append(fi)
elif fi[-7:] == 'kreport':
bracken_kreport.append(fi)
for bk in bracken_kreport:
with open('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+bk, 'rU') as f:
bk = []
domains = {}
this_domain, domain_name = [], ''
for row in csv.reader(f, delimiter='\t'):
bk.append(row)
row[5] = row[5].lstrip()
if row[3] == 'D':
if domain_name != '':
domains[domain_name] = this_domain
this_domain, domain_name = [], row[5]
else:
if row[3] != 'R' and row[3] != 'U' and 'D' not in row[3]:
this_domain.append(row[5])
domains[domain_name] = this_domain
for domain in domains:
if domain in all_domains:
all_domains[domain] = list(set(all_domains[domain]+domains[domain]))
else:
all_domains[domain] = list(set(domains[domain]))
for b in bracken:
if len(b) > 22:
continue
sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+b, sep='\t', header=0, index_col=0)
b = b.replace('_150', '')
sample.drop(['taxonomy_id', 'taxonomy_lvl', 'kraken_assigned_reads', 'added_reads', 'fraction_total_reads'], axis=1, inplace=True)
sample.rename(columns={'new_est_reads':b[:-8]}, inplace=True)
bracken_pd.append(sample)
for k in kraken:
sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+k, sep='\t', header=None, index_col=3)
sample = sample.loc[['U', 'D'], :]
sample = sample.rename(columns=kraken_columns).drop(['Number fragments taxon', 'NCBI ID'], axis=1).rename(columns={'Percent fragments clade':k[:-8]+'_percent', 'Number fragments clade':k[:-8]+'_reads'}).set_index('Taxon name')
taxa = list(sample.index.values)
taxa_dict = {}
for t in taxa:
taxa_dict[t] = t.replace(' ', '')
sample = sample.rename(index=taxa_dict)
kraken_pd.append(sample)
bracken = pd.concat(bracken_pd, join='outer')
kraken = pd.concat(kraken_pd, join='outer')
kraken = kraken.rename(index={'d__Bacteria':'Bacteria', 'd__Archaea':'Archaea'})
kraken = kraken.groupby(by=kraken.index, axis=0).sum()
bracken = bracken.groupby(by=bracken.index, axis=0).sum().fillna(value=0)
kraken_all_db.append(kraken), bracken_all_db.append(bracken)
#17
x1 = [x for x in range(21)]
x2 = [x+0.3 for x in range(21)]
tax_plotting = ['Archaea', 'Bacteria', 'Eukaryota', 'Viruses', 'unclassified']
color_plotting = ['#EDBB99', '#5499C7', '#7DCEA0', '#F7DC6F', '#CCD1D1']
tax_paper = ['Bacteria', 'Eukaryota', 'Other', 'Unclassified']
color_paper = ['#5499C7', '#7DCEA0', '#CD6155', '#CCD1D1']
from_paper = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/from_paper.csv', header=0, index_col=0)
def get_summary_reads(kraken_db):
fig = plt.figure(figsize=(15,15))
ax1, ax2, ax3, ax4, ax5 = plt.subplot(321), plt.subplot(322), plt.subplot(323), plt.subplot(324), plt.subplot(325)
ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
ax5.set_title('From paper')
ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
axs = [ax1, ax2, ax3, ax4, ax5]
for s in range(len(samples)):
if s == 0:
continue
bottom = 0
for t in range(len(tax_paper)):
ax5.bar(x1[s], from_paper.loc[tax_paper[t], samples[s]], bottom=bottom, color=color_paper[t], edgecolor='k', width=0.6)
bottom += from_paper.loc[tax_paper[t], samples[s]]
for db in range(len(kraken_db)):
ax_using = ax_plot[db]
x = x_plot[db]
db = kraken_db[db]
handles = []
for tax in range(len(tax_plotting)):
handles.append(Patch(facecolor=color_plotting[tax], edgecolor='k', label=tax_plotting[tax]))
tax = tax_plotting[tax]
if tax not in list(db.index.values):
db.loc[tax] = [0 for i in range(db.shape[1])]
handles.append(Patch(facecolor=color_paper[2], edgecolor='k', label='Other'))
db = db.fillna(value=0)
for s in range(len(samples)):
bottom = 0
for t in range(len(tax_plotting)):
prop = db.loc[tax_plotting[t], samples[s]+'_reads']
cat = total_reads.loc[samples[s], 'cat_reads']
prop = (prop/cat)*100
ax_using.bar(x[s], prop, bottom=bottom, color=color_plotting[t], edgecolor='k', width=0.3)
bottom += prop
ax2.legend(handles=handles, bbox_to_anchor=(1,1.05))
for ax in axs:
plt.sca(ax)
plt.xticks(x1, ['' for x in x1])
plt.ylim([0, 100])
plt.xlim([-0.5, 20.5])
for x in x1:
ax5.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
ax4.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
ax1.set_ylabel('Classified (%)'), ax3.set_ylabel('Classified (%)'), ax5.set_ylabel('Classified(%)')
#plt.tight_layout()
return
def get_summary_bacteria(kraken_db):
tax_plotting = ['Bacteria']
alpha = ['#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F']
fig = plt.figure(figsize=(10,8))
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
axs = [ax1, ax2, ax3, ax4]
for db in range(len(kraken_db)):
ax_using = ax_plot[db]
x = x_plot[db]
alp = alpha[db]
db = kraken_db[db]
for tax in range(len(tax_plotting)):
tax = tax_plotting[tax]
if tax not in list(db.index.values):
db.loc[tax] = [0 for i in range(db.shape[1])]
db = db.fillna(value=0)
for s in range(len(samples)):
bottom = 0
for t in range(len(tax_plotting)):
prop = db.loc[tax_plotting[t], samples[s]+'_reads']
cat = total_reads.loc[samples[s], 'cat_reads']
ax_using.bar(x[s], prop, bottom=bottom, color=alp, edgecolor='k', width=0.3)
bottom += prop
handles = []
handles.append(Patch(facecolor=alpha[0], edgecolor='k', label='No confidence value'))
handles.append(Patch(facecolor=alpha[1], edgecolor='k', label='Confidence=0.1'))
ax2.legend(handles=handles, bbox_to_anchor=(1.6,1.03))
for ax in axs:
plt.sca(ax)
plt.xticks(x1, ['' for x in x1])
plt.semilogy()
plt.xlim([-0.5, 20.5])
#plt.ylim([0, 100])
plt.xlim([-0.5, 20.5])
for x in x1:
pl = ((1/21)*(x+1))-0.02
ax3.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax3.transAxes)
ax4.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax4.transAxes)
ax1.set_ylabel('Number of reads'), ax3.set_ylabel('Number of reads')
#plt.tight_layout()
return
A summary of the percentage of reads classified as different domains with different databases. Note that the ‘From paper’ plot uses the classifications given in the original paper, where 10,000 unmapped reads were classified using BLAST searches of the NCBI database.
#18
get_summary_reads(kraken_all_db)
plt.show()
Summary of the number of reads that are classified as bacteria by each database.
#19
get_summary_bacteria(kraken_all_db)
plt.tight_layout()
plt.show()
These first plots are all separately with the confidence parameter set. See the last tab for those without the confidence parameter set.
#20
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []
for db in range(len(bracken_all_db)):
d = int(db)
db = bracken_all_db[db]
species = list(db.index.values)
keeping = []
species_dict = {}
for sp in species:
if sp in bacteria:
keeping.append(True)
new_sp = sp.split('__')
if len(new_sp) > 1:
new_sp = new_sp[1]
else:
new_sp = new_sp[0]
species_dict[sp] = new_sp.split(' ')[0].replace("'", '')
else:
keeping.append(False)
in_len = db.shape[0]
db = db.loc[keeping, :]
strain.append(db)
db = db.rename(index=species_dict)
db = db.groupby(by=db.index, axis=0).sum()
sums = db.sum(axis=0)
gen_sums.append(sums)
db = db.divide(sums, axis=1).multiply(100)
genera.append(db)
db.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/'+db_names[d]+'.csv')
gen_names = gen_names+list(db.index.values)
db = db[db.max(axis=1) > 1]
genera_1.append(db)
gen_names_1 = gen_names_1+list(db.index.values)
gen_names = list(set(gen_names))
gen_names_1 = list(set(gen_names_1))
#21
def plot_four_nmds(dbs, metric, name):
fig = plt.figure(figsize=(15,10))
#fig.suptitle(name+metric+'\n\n\n')
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
axs = [ax3, ax1, ax2, ax4]
ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
for db in range(len(dbs)):
n = db
db = dbs[db].transpose()
pos, npos, stress = transform_for_NMDS(db, metric)
for a in range(len(npos)):
axs[n].scatter(npos[a,0], npos[a,1], marker=shapes[a], color=colors[a], s=100, edgecolor='k')
axs[n].set_xlabel('nMDS1')
axs[n].set_ylabel('nMDS2')
handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
plt.tight_layout()
return
#22
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'braycurtis', 'NMDS confidence=0.1 strain ')
plt.show()
#23
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'braycurtis', 'NMDS confidence=0.1 genera ')
plt.show()
#24
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'euclidean', 'NMDS confidence=0.1 strain ')
plt.show()
#25
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'euclidean', 'NMDS confidence=0.1 genera ')
plt.show()
#26
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'jaccard', 'NMDS confidence=0.1 strain ')
plt.show()
#27
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'jaccard', 'NMDS confidence=0.1 genera ')
plt.show()
Bray-curtis distance at strain level
#28
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'braycurtis', 'NMDS no confidence strain ')
plt.show()
Bray-curtis distance at genus level
#29
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'braycurtis', 'NMDS no confidence genera ')
plt.show()
Euclidean distance at strain level
#30
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'euclidean', 'NMDS no confidence strain ')
plt.show()
Euclidean distance at genus level
#31
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'euclidean', 'NMDS no confidence genera ')
plt.show()
Jaccard distance at strain level
#32
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'jaccard', 'NMDS no confidence strain ')
plt.show()
Jaccard distance at genus level
#33
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'jaccard', 'NMDS no confidence genera ')
plt.show()
These plots are now only calculated for the classifications that used confidence = 0.1. Genera with below 1% maximum relative abundance are removed and the numbers in brackets are the number of reads that were classified as bacteria.
#34
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
#bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
#genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []
gen_names_1 = sorted(gen_names_1)
def plot_genera(db, sums, tax_cols, gen_names_1, dname):
plt.figure(figsize=(10,5))
ax1 = plt.subplot(111)
bottom = [0 for x in range(len(db.columns))]
handles = []
for g in range(len(gen_names_1)):
if gen_names_1[g] in db.index.values:
this_row = db.loc[gen_names_1[g], :].values
ax1.bar(x1, this_row, bottom=bottom, color=tax_cols[g], edgecolor='k')
handles.append(Patch(facecolor=tax_cols[g], edgecolor='k', label=gen_names_1[g]))
for b in range(len(bottom)):
bottom[b] += this_row[b]
ax1.legend(handles=handles, bbox_to_anchor=(1, 1.03), ncol=3)
plt.xticks(x1, ['' for x in x1])
plt.ylabel('Relative abundance(%)')
plt.xlim([-0.5, 20.5])
plt.ylim([0, 100])
for x in x1:
n = str(int(sums[samples[x]]))
ax1.text(x, -2, sample_names[x]+' ('+n+')', color=colors[x], rotation=90, va='top', ha='center')
plt.tight_layout()
return
gen_plot = [genera_1[1], genera_1[3], genera_1[5], genera_1[7]]
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
all_sums = [gen_sums[1], gen_sums[3], gen_sums[5], gen_sums[7]]
tax_cols = get_cols(len(gen_names_1))
#35
plot_genera(gen_plot[1], all_sums[1], tax_cols, gen_names_1, db_name[1])
plt.show()
#36
plot_genera(gen_plot[2], all_sums[2], tax_cols, gen_names_1, db_name[2])
plt.show()
#37
plot_genera(gen_plot[0], all_sums[0], tax_cols, gen_names_1, db_name[0])
plt.show()
#38
plot_genera(gen_plot[3], all_sums[3], tax_cols, gen_names_1, db_name[3])
plt.show()
Here I have carried out ANCOM2 tests for differential abundance of genera between body sites. All genera are then plotted on a heatmap with mean values for each body site and stars to denote significant differences as determined by ANCOM.
While many of these results vary between databases, it is worth noting that some differences are consistent, e.g.:
1. Prevotella are always more abundant in saliva samples
2. Streptococcus are always more abundant in buccal samples
3. Klebsiella (where present) are always more abundant in blood samples
#39
def get_differences(new_genus, db_name):
new_genus = new_genus.drop(['SRR8595488'], axis=1)
new_genus = new_genus[new_genus.max(axis=1) > 0.1]
samples = list(new_genus.columns)
list_comparisons = [['Blood', 'Saliva'], ['Blood', 'Buccal'], ['Saliva', 'Buccal'], ['Saliva', 'Saliva_euk'], ['Buccal', 'Buccal_euk'], ['Blood', 'Saliva_euk'], ['Blood', 'Buccal_euk'], ['Saliva_euk', 'Buccal_euk']]
comparisons, metadata, comp_len = [], [], []
for a in range(len(list_comparisons)):
keeping = []
this_md = []
for b in range(len(samples)):
if site_dict[samples[b]] in list_comparisons[a]:
keeping.append(True)
this_md.append([samples[b], site_dict[samples[b]]])
else:
keeping.append(False)
this_comp = new_genus.loc[:, keeping]
this_comp = this_comp[this_comp.max(axis=1) > 0.1]
comparisons.append(this_comp)
this_md = pd.DataFrame(this_md, columns=['Samples', 'Groups'])
#this_md.set_index('Samples', inplace=True)
metadata.append(this_md)
comp_len.append(this_comp.shape[0])
new_genus = new_genus.rename(columns=site_dict, inplace=False)
return comparisons, metadata, new_genus, comp_len
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
comparison_names = [r'Blood vs saliva', r'Blood vs buccal', r'Saliva vs buccal', r'Saliva vs saliva_euk', r'Buccal vs buccal_euk', r'Blood vs saliva_euk', r'Blood vs buccal_euk', r'Saliva_euk vs buccal_euk']
source("/Users/robynwright/Documents/OneDrive/Github/R-notebooks/ancom_v2.1.R")
get_ancom <- function(ft, md) {
all_ancom = list()
for (a in 1:8){
feature_table = ft[a]
meta_data = md[a]
process = feature_table_pre_process(feature_table, meta_data, 'Samples', 'Groups', lib_cut=10, neg_lb=TRUE)
ancom_out = ANCOM(process$feature_table, process$meta_data, process$struc_zero, main_var='Groups')
all_ancom[[a]] <- ancom_out$out
}
return(all_ancom)
}
def sort_ancom_results(r_ancom):
ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06 = [], [], [], []
for a in range(len(r_ancom)):
this_sig_09, this_sig_08, this_sig_07, this_sig_06 = [], [], [], []
r_ancom[a].set_index('taxa_id', inplace=True)
all_sp = list(r_ancom[a].index.values)
for b in range(len(all_sp)):
if r_ancom[a].loc[all_sp[b], 'detected_0.9'] == True:
this_sig_09.append(all_sp[b])
if r_ancom[a].loc[all_sp[b], 'detected_0.8'] == True:
this_sig_08.append(all_sp[b])
if r_ancom[a].loc[all_sp[b], 'detected_0.7'] == True:
this_sig_07.append(all_sp[b])
if r_ancom[a].loc[all_sp[b], 'detected_0.6'] == True:
this_sig_06.append(all_sp[b])
ancom_lists_09.append(this_sig_09), ancom_lists_08.append(this_sig_08), ancom_lists_07.append(this_sig_07), ancom_lists_06.append(this_sig_06)
return [ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06]
def plot_heatmap(new_genus, ANCOM):
print(ANCOM)
names = ['Blood', 'Saliva', 'Buccal', 'Saliva_euk', 'Buccal_euk']
other_names = ['Abundance', r'Blood $vs$ saliva', r'Blood $vs$ buccal', r'Saliva $vs$ buccal', r'Saliva $vs$ saliva_euk', r'Buccal $vs$ buccal_euk', r'Blood $vs$ saliva_euk', r'Blood $vs$ buccal_euk', r'Saliva_euk $vs$ buccal_euk']
colormap, norm = mpl.cm.get_cmap('plasma', 256), mpl.colors.Normalize(vmin=0, vmax=1)
m = mpl.cm.ScalarMappable(norm=norm, cmap=colormap)
if list(new_genus.index.values)[0][0] == 'A':
new_genus = new_genus.iloc[::-1]
figure = plt.figure(figsize=(5,new_genus.shape[0]*0.2))
ax1 = plt.subplot(111)
genus_names = list(new_genus.index.values)
y = []
for g in range(len(genus_names)):
this_row = new_genus.loc[genus_names[g], :]
values = [(np.mean(this_row[name].values)) for name in names]
bottom, top, x = [g for a in range(5)], [1 for a in range(5)], [a for a in range(5)]
y.append(g+0.5)
ma = max(values)
values = [v/ma for v in values]
values = [m.to_rgba(v) for v in values]
ax1.bar(x, top, bottom=bottom, color=values, edgecolor='k', width=1)
x.append(5)
ax1.scatter(x[-1], bottom[-1]+0.5, color='k', s=ma*5)
sig, x_plt = [], x[-1]
for a in range(len(ANCOM)):
x_plt += 0.75
if genus_names[g] in ANCOM[a]: ax1.scatter(x_plt, bottom[-1]+0.5, marker='*', color='k')
x.append(x_plt)
plt.ylim([0, len(genus_names)]), plt.xlim([-0.5, x[-1]+0.5])
plt.yticks(y, genus_names)
plt.xticks(x, names+other_names, rotation=90)
ax1.xaxis.set_ticks_position('top')
plt.tight_layout()
plt.show()
return
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 41 | 41 | 37 | 26 | 8 |
| Blood vs buccal | 37 | 27 | 21 | 15 | 3 |
| Saliva vs buccal | 45 | 3 | 3 | 2 | 1 |
| Saliva vs saliva_euk | 41 | 0 | 0 | 0 | 0 |
| Buccal vs buccal_euk | 39 | 2 | 0 | 0 | 0 |
| Blood vs saliva_euk | 38 | 32 | 24 | 15 | 3 |
| Blood vs buccal_euk | 34 | 11 | 5 | 2 | 2 |
| Saliva_euk vs buccal_euk | 43 | 5 | 3 | 2 | 1 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
## [['Burkholderia', 'Enterobacter', 'Klebsiella', 'Paracoccus', 'Pasteurella', 'Prevotella', 'Rothia', 'Staphylococcus'], ['Burkholderia', 'Rothia', 'Streptococcus'], ['Prevotella'], [], [], ['Burkholderia', 'Prevotella', 'Rothia'], ['Rothia', 'Streptococcus'], ['Prevotella']]
##
## <string>:9: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_human_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 49 | 48 | 47 | 38 | 9 |
| Blood vs buccal | 48 | 35 | 16 | 14 | 4 |
| Saliva vs buccal | 39 | 5 | 4 | 1 | 0 |
| Saliva vs saliva_euk | 30 | 0 | 0 | 0 | 0 |
| Buccal vs buccal_euk | 37 | 0 | 0 | 0 | 0 |
| Blood vs saliva_euk | 49 | 47 | 33 | 22 | 5 |
| Blood vs buccal_euk | 50 | 19 | 14 | 12 | 4 |
| Saliva_euk vs buccal_euk | 42 | 4 | 3 | 3 | 2 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
## [['Capnocytophaga', 'Clostridium', 'Klebsiella', 'Mogibacterium', 'Prevotella', 'Pseudomonas', 'Rothia', 'Streptomyces', 'Veillonella'], ['Klebsiella', 'Rothia', 'Streptococcus', 'Streptomyces'], [], [], [], ['Campylobacter', 'Klebsiella', 'Prevotella', 'Pseudomonas', 'Streptomyces'], ['Klebsiella', 'Rothia', 'Streptococcus', 'Streptomyces'], ['Prevotella', 'Streptococcus']]
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/gtdb_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 98 | 88 | 82 | 32 | 6 |
| Blood vs buccal | 91 | 21 | 14 | 8 | 5 |
| Saliva vs buccal | 96 | 18 | 10 | 2 | 1 |
| Saliva vs saliva_euk | 93 | 3 | 2 | 0 | 0 |
| Buccal vs buccal_euk | 95 | 8 | 5 | 1 | 0 |
| Blood vs saliva_euk | 92 | 27 | 17 | 11 | 4 |
| Blood vs buccal_euk | 75 | 7 | 7 | 3 | 2 |
| Saliva_euk vs buccal_euk | 97 | 19 | 13 | 7 | 1 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
## [['Mycobacterium', 'Pauljensenia', 'Prevotella', 'Psychrobacter', 'Streptococcus', 'Veillonella'], ['Actinomyces', 'Prevotella', 'Rothia', 'Streptococcus', 'Veillonella'], ['Prevotella'], [], [], ['Acinetobacter', 'Prevotella', 'Streptococcus', 'Veillonella'], ['Acinetobacter', 'Poseidonibacter'], ['Prevotella']]
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/refseq_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 85 | 81 | 77 | 35 | 11 |
| Blood vs buccal | 82 | 65 | 36 | 21 | 7 |
| Saliva vs buccal | 69 | 9 | 6 | 3 | 0 |
| Saliva vs saliva_euk | 64 | 0 | 0 | 0 | 0 |
| Buccal vs buccal_euk | 76 | 4 | 4 | 1 | 0 |
| Blood vs saliva_euk | 86 | 72 | 37 | 20 | 4 |
| Blood vs buccal_euk | 79 | 18 | 15 | 8 | 2 |
| Saliva_euk vs buccal_euk | 84 | 12 | 10 | 5 | 1 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
## [['Bordetella', 'Enterococcus', 'Escherichia', 'Klebsiella', 'Mesorhizobium', 'Mycobacterium', 'Prevotella', 'Pseudomonas', 'Salmonella', 'Staphylococcus', 'Veillonella'], ['Actinomyces', 'Mycobacterium', 'Prevotella', 'Pseudomonas', 'Rothia', 'Streptococcus', 'Veillonella'], [], [], [], ['Mycobacterium', 'Prevotella', 'Pseudomonas', 'Veillonella'], ['Actinomyces', 'Veillonella'], ['Prevotella']]